{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['this’s a sent tokenize test.',\n",
       " 'this is sent two.',\n",
       " 'is this sent three?',\n",
       " 'sent 4 is cool!',\n",
       " 'Now it’s your turn.']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import nltk.data\n",
    "text = 'this’s a sent tokenize test. this is sent two. is this sent three? sent 4 is cool! Now it’s your turn.'\n",
    "tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')\n",
    "tokenizer.tokenize(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Hola amigo.', 'Estoy bien.']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "spanish_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')\n",
    "spanish_tokenizer.tokenize('Hola amigo. Estoy bien.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Dumping model to file cache C:\\Users\\user\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 2.219 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Full Mode: 我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学\n",
      "Default Mode: 我/ 来到/ 北京/ 清华大学\n"
     ]
    }
   ],
   "source": [
    "#中文用结吧分词\n",
    "import jieba\n",
    "seg_list = jieba.cut(\"我来到北京清华大学\",cut_all=True)\n",
    "print (\"Full Mode:\", \"/ \".join(seg_list)) #全模式\n",
    "\n",
    "seg_list = jieba.cut(\"我来到北京清华大学\",cut_all=False)\n",
    "print (\"Default Mode:\", \"/ \".join(seg_list)) #精确模式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Hello', 'World', '.']"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from nltk.tokenize import word_tokenize\n",
    "word_tokenize('Hello World.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Dive',\n",
       " 'into',\n",
       " 'NLTK',\n",
       " ':',\n",
       " 'Part-of-speech',\n",
       " 'tagging',\n",
       " 'and',\n",
       " 'POS',\n",
       " 'Tagger']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text = nltk.word_tokenize('Dive into NLTK: Part-of-speech tagging and POS Tagger')\n",
    "text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('Dive', 'NNP'),\n",
       " ('into', 'IN'),\n",
       " ('NLTK', 'NNP'),\n",
       " (':', ':'),\n",
       " ('Part-of-speech', 'JJ'),\n",
       " ('tagging', 'NN'),\n",
       " ('and', 'CC'),\n",
       " ('POS', 'NNP'),\n",
       " ('Tagger', 'NNP')]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nltk.pos_tag(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NNP: noun, proper, singular\n",
      "    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos\n",
      "    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA\n",
      "    Shannon A.K.C. Meltex Liverpool ...\n"
     ]
    }
   ],
   "source": [
    "#安装tagset\n",
    "nltk.help.upenn_tagset('NNP')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3914"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#安装treebank\n",
    "from nltk.corpus import treebank\n",
    "len(treebank.tagged_sents())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('Pierre', 'NNP'),\n",
       " ('Vinken', 'NNP'),\n",
       " (',', ','),\n",
       " ('61', 'CD'),\n",
       " ('years', 'NNS'),\n",
       " ('old', 'JJ'),\n",
       " (',', ','),\n",
       " ('will', 'MD'),\n",
       " ('join', 'VB'),\n",
       " ('the', 'DT'),\n",
       " ('board', 'NN'),\n",
       " ('as', 'IN'),\n",
       " ('a', 'DT'),\n",
       " ('nonexecutive', 'JJ'),\n",
       " ('director', 'NN'),\n",
       " ('Nov.', 'NNP'),\n",
       " ('29', 'CD'),\n",
       " ('.', '.')]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data = treebank.tagged_sents()[:3000]\n",
    "test_data = treebank.tagged_sents()[3000:]\n",
    "train_data[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.875674508957479"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from nltk.tag import tnt\n",
    "tnt_pos_tagger = tnt.TnT()\n",
    "tnt_pos_tagger.train(train_data)\n",
    "tnt_pos_tagger.evaluate(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "f = open('tnt_treebank_pos_tagger.pickle', 'wb+')\n",
    "pickle.dump(tnt_pos_tagger, f)\n",
    "f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('this', 'DT'),\n",
       " ('is', 'VBZ'),\n",
       " ('a', 'DT'),\n",
       " ('tnt', 'Unk'),\n",
       " ('treebank', 'Unk'),\n",
       " ('tnt', 'Unk'),\n",
       " ('tagger', 'Unk')]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tnt_pos_tagger.tag(nltk.word_tokenize('this is a tnt treebank tnt tagger'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
